# Load the data and keep every column except `date`, as a matrix.
data <- read.csv("covid_vals.csv", header = TRUE)
x <- as.matrix(select(data, !date))

# Convert to the "Incomplete" class and run biScale with both row and column
# scaling switched off.
xs <- as(x, "Incomplete")
xsc <- biScale(xs, row.scale = FALSE, col.scale = FALSE)

# Fit softImpute (type "svd", rank capped at 20) and use the fit to fill in the
# matrix.
fit <- softImpute(xsc, rank.max = 20, type = "svd", trace.it = FALSE)
dataimp <- complete(xsc, fit)

# PCA on the completed matrix, then each component's share of total variance.
pr.out <- prcomp(dataimp)
pve <- pr.out$sdev^2 / sum(pr.out$sdev^2)

# Base-graphics scree plot with the x axis limited to 1..20.
plot(
  pve,
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained",
  main = "Variance by PC",
  xlim = c(1, 20)
)

# Proportion of variance explained per principal component, with the component
# index stored alongside it for plotting.
pve <- data.frame(var = pr.out$sdev^2 / sum(pr.out$sdev^2))
pve$id <- seq_len(nrow(pve))

# Left panel: variance share of each individual component.
p1 <- ggplot(pve, aes(x = id, y = var)) +
  geom_point() +
  geom_line() +
  labs(
    title = "Each",
    x = "Principal component r",
    y = "Proportion variance explained"
  )

# Right panel: running total of the variance shares.
p2 <- ggplot(pve, aes(x = id, y = cumsum(var))) +
  geom_point() +
  geom_line() +
  labs(
    title = "Cumulative Sum",
    x = "Principal Components 1:r",
    y = ""
  )

# Show both panels side by side.
grid.arrange(p1, p2, ncol = 2)

# Rebuild the variance-explained table (share per component plus its index).
pve <- data.frame(var = pr.out$sdev^2 / sum(pr.out$sdev^2))
pve$id <- seq_len(nrow(pve))

# Same two panels as before, but with the x axis limited to 0..200.
# NOTE: xlim() sets scale limits, so ggplot2 warns about the rows it removes
# when the plots are drawn.
p1 <- ggplot(pve, aes(x = id, y = var)) +
  geom_point() +
  geom_line() +
  labs(
    title = "Each",
    x = "Principal component r",
    y = "Proportion variance explained"
  ) +
  xlim(0, 200)

p2 <- ggplot(pve, aes(x = id, y = cumsum(var))) +
  geom_point() +
  geom_line() +
  labs(
    title = "Cumulative Sum",
    x = "Principal Components 1:r",
    y = ""
  ) +
  xlim(0, 200)

# Show both panels side by side.
grid.arrange(p1, p2, ncol = 2)
## Warning: Removed 2735 rows containing missing values (geom_point).
## Warning: Removed 2735 row(s) containing missing values (geom_path).
## Warning: Removed 2735 rows containing missing values (geom_point).
## Warning: Removed 2735 row(s) containing missing values (geom_path).

# First 20 rows and first 3 columns of the PCA rotation (loadings) matrix.
pr.out$rotation[seq_len(20), seq_len(3)]
##                PC1          PC2          PC3
##  [1,] 0.0008034089 0.0005204555 0.0009574780
##  [2,] 0.0040094261 0.0023439641 0.0047593976
##  [3,] 0.0095078810 0.0069030066 0.0087894265
##  [4,] 0.0029965171 0.0020496035 0.0025369331
##  [5,] 0.0033840026 0.0019709162 0.0013207289
##  [6,] 0.0037922622 0.0019607093 0.0004950399
##  [7,] 0.0117435120 0.0062163718 0.0014560769
##  [8,] 0.0134139753 0.0061813932 0.0013796148
##  [9,] 0.0202931144 0.0085948981 0.0027404208
## [10,] 0.0204477527 0.0095891436 0.0033941669
## [11,] 0.0215000508 0.0107517194 0.0031307775
## [12,] 0.0060214355 0.0031862488 0.0012389615
## [13,] 0.0262182743 0.0131155446 0.0048166882
## [14,] 0.0083219311 0.0043636948 0.0030876212
## [15,] 0.0179170187 0.0094022711 0.0070804777
## [16,] 0.0187695374 0.0105245927 0.0078685376
## [17,] 0.0404040324 0.0207316866 0.0124757878
## [18,] 0.0409628471 0.0210745889 0.0098745064
## [19,] 0.0209603195 0.0108717999 0.0051269201
## [20,] 0.0209779175 0.0109401546 0.0049623020
# Sum of `pve$var` over the first 20, 50, 200 and 1000 components.
sum(pve$var[seq_len(20)])
## [1] 0.7375811
sum(pve$var[seq_len(50)])
## [1] 0.8550725
sum(pve$var[seq_len(200)])
## [1] 0.9548083
sum(pve$var[seq_len(1000)])
## [1] 0.9951515
# K-means clustering on the leading principal-component scores.
#
# The same fit-and-plot sequence is run for k = 4, 3, 8 and 5. The shared logic
# lives in the three helpers below; only `k` differs between runs.

# Principal-component scores; the first 50 columns are the clustering input.
pca <- data.frame(pr.out$x)
pca_km <- pca[, 1:50]

# Run k-means with `k` centres on `scores`.
# The RNG is reseeded before every fit so that each run is reproducible on its
# own, independent of whichever runs came before it.
fit_pc_kmeans <- function(scores, k, seed = 2, nstart = 20) {
  set.seed(seed)
  kmeans(scores, k, nstart = nstart)
}

# 2-D scatter of PC1 against PC2, coloured by cluster membership.
# `clusters` is forced so the plot keeps the assignment it was built with even
# if it is printed after a later k-means run.
plot_pc_clusters_2d <- function(scores, clusters) {
  force(clusters)
  ggplot(
    data = scores,
    mapping = aes(x = PC1, y = PC2, col = as.factor(clusters))
  ) +
    geom_point() +
    labs(col = "clusters")
}

# 3-D plotly scatter of PC1, PC2 and PC3, coloured by cluster membership.
# `clusters` is forced for the same reason as in the 2-D helper: plotly
# evaluates the colour formula lazily, when the figure is built.
plot_pc_clusters_3d <- function(scores, clusters) {
  force(clusters)
  fig <- plot_ly()
  fig <- fig %>%
    add_markers(
      data = scores,
      x = ~PC1, y = ~PC2, z = ~PC3,
      color = ~as.factor(clusters),
      showlegend = FALSE
    )
  fig %>%
    layout(scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    ))
}

# The runs are kept as top-level expressions (not a loop) so that each plot is
# auto-printed, and so `km_out` and `fig` end up holding the last run (k = 5).

# k = 4
km_out <- fit_pc_kmeans(pca_km, 4)
plot_pc_clusters_2d(pca_km, km_out$cluster)
fig <- plot_pc_clusters_3d(pca_km, km_out$cluster)
fig

# k = 3
km_out <- fit_pc_kmeans(pca_km, 3)
plot_pc_clusters_2d(pca_km, km_out$cluster)
fig <- plot_pc_clusters_3d(pca_km, km_out$cluster)
fig

# k = 8
km_out <- fit_pc_kmeans(pca_km, 8)
plot_pc_clusters_2d(pca_km, km_out$cluster)
fig <- plot_pc_clusters_3d(pca_km, km_out$cluster)
fig

# k = 5
km_out <- fit_pc_kmeans(pca_km, 5)
plot_pc_clusters_2d(pca_km, km_out$cluster)
fig <- plot_pc_clusters_3d(pca_km, km_out$cluster)
fig